#!/usr/bin/perl
use strict;
use POSIX;
use Cwd;
use Cwd 'abs_path';

use List::Util qw(min max);
use Getopt::Long;

use lib './scripts';
use ConservatoryUtils;
use ConservatoryTree;
use GenomeDatabase;
use MappingDatabase;
use CNSDatabase;


$|=1;

### Filenames
my $conservatoryDir=abs_path(".");
my $tmpDirRoot;
my $CNSFileName;
my $mapFileName;
my $outputCNSFileName;
my $outputMapFileName;
my $CNSSubsetFile;

my $keep_tmp=0;

## Filtering parameters
my $splitCNSSensitivity=2; ### Parameters for sensing CNS split. High sensitivity (2) low sensitivity (4)
my $minOverlapAfterSplit=0.2; ### Mandate at least this level of overlap to assign a mapping to a split CNS

my $verbose=0;
my $help=0;
my $totalToScreen;

my $break=0;
my $reconstruct=0;

GetOptions ("conservatoryDirectory=s" => \$conservatoryDir,
			"in-cns=s" => \$CNSFileName,
			"in-map=s" => \$mapFileName,
			"out-cns=s" => \$outputCNSFileName,
			"out-map=s" => \$outputMapFileName,
			"verbose" => \$verbose,
			"break" => \$break,
			"reconstruct" => \$reconstruct,
			"tmp-dir=s" => \$tmpDirRoot,
			"help" => \$help) or die ("Error in command line arguments\n");
			
			
if($help || !defined $CNSFileName || !defined $mapFileName || !defined $outputCNSFileName ) {
	print "Conservatory version 2.0.1\n\n";
	print "reconstructCNSSequences --in-cns <cnsFile> --in-map <cnsPositionMapFile> --out-cns <cnsFile> --out-map <mapFile> [--verbose] [--split <curProcess> --total-splits <total number of processes>].\n\n";
	
	exit();
}
### Load genome and tree
my $genomeDB = new GenomeDatabase($conservatoryDir);
my $conservatoryTree = new ConservatoryTree($genomeDB);

### Prepare temporary files

if(!defined $tmpDirRoot) { $tmpDirRoot = $genomeDB->getTemporaryDir(); }

my $tmpDir = $tmpDirRoot . "/S" . $$ . "/";
mkdir $tmpDir;
$genomeDB->setTemporaryDir($tmpDir);

my $tmpCNSToReconstructFileName = "$tmpDir/tmpCNSToReconstruct.txt";

print "PROGRESS: Reading CNS from $CNSFileName.\n";
my $CNSDB = new CNSDatabase($CNSFileName);
$CNSDB->orderCNSByName();

open (my $tmpCNSToReconstructFile , ">$tmpCNSToReconstructFileName") || die ("ERROR: Cannot open temporary file $tmpCNSToReconstructFileName.\n");

print "PROGRESS: Compiling CNS list...\n";
foreach my $curCNS (@{ $CNSDB->getCNSByOrder() } ) {
	print $tmpCNSToReconstructFile $curCNS->getID() ."\n";
}
close($tmpCNSToReconstructFile);

### Load just the relevant mappings for the CNSs we are working with

my $mappingDB = new MappingDatabase("grep -f $tmpCNSToReconstructFileName $mapFileName |");
unlink($tmpCNSToReconstructFileName);
my $curPos=1;

### record current directory and move to the temporary directory. This is because fastml creates a log.txt file
### In the working directory and there is no way to change that. When run in parallel this creates file conflicts...

my $curDir = getcwd();
chdir $tmpDir;
$curPos=0;
$totalToScreen = $CNSDB->getNumberOfCNSs();

if($verbose) { print "PROGRESS: Processing $totalToScreen CNSs...\n"; }
foreach my $curCNS (@{ $CNSDB->getCNSByOrder() } ) {
	### If its a super CNS, see if we can split it	
	$curPos++;

	if($curCNS->isMerged()) {
		if($break) {
			my @CNSBreakpoints = $mappingDB->findBreakPointsForCNS($curCNS, $splitCNSSensitivity);  ### Find breakpoints with high sensitivity
						
			if(@CNSBreakpoints > 2) {
				if($verbose) { print "PROGRESS: Breaking CNS (" . $curPos ."/ $totalToScreen). Breakpoints: " . join(",", @CNSBreakpoints) . ".......\n"; }
				my @assignedMappings = $mappingDB->assignMappingsToBreakpoints($curCNS, \@CNSBreakpoints, $minOverlapAfterSplit); ### Assign to split CNS, filtering low overlap
				### Now break the CNSs		
				for my $curBreakpoint (0.. ((scalar @CNSBreakpoints) - 2)) {
					my $breakLen = $CNSBreakpoints[$curBreakpoint+1]- $CNSBreakpoints[$curBreakpoint];
					my $CNSforBreakpointName = $curCNS->getID() . ".$CNSBreakpoints[$curBreakpoint]";
					print "PROGRESS: Breaking " . $curCNS->getID() . " to $CNSforBreakpointName.\n";
					### Create new sub-CNS
					my $CNSforBreakpoint = new CNS($curCNS->getRefGenome(), $CNSforBreakpointName, $curCNS->getLocus(), $curCNS->getPos() + $CNSBreakpoints[$curBreakpoint], $breakLen, $curCNS->getConservationLevel(),
												   $curCNS->getSupportingSpeciesNumber(), substr($curCNS->getSeq(),$CNSBreakpoints[$curBreakpoint], $breakLen), $curCNS->getRegulatorySeqPart());
					$CNSDB->add($CNSforBreakpoint);

					## Assign the mappings
					foreach my $curMapping (@assignedMappings) {
						### If there is an overlap, assign the deep CNS to the new subCNS.
						if($curMapping->getBreakPoint() == $curBreakpoint) {
							$mappingDB->linkMappingToCNS($curMapping, $CNSforBreakpoint);
							$curMapping->setRRP( $curMapping->getRRP() - $CNSBreakpoints[$curBreakpoint]);
						}
					}

					### and reconstruct the ancesteral seq
					my %speciesSeq = getSpeciesSequencesForCNS($CNSforBreakpoint, $mappingDB, $conservatoryTree);
					if( (scalar %speciesSeq) >= $minSpeciesForCNS) {
						if($reconstruct) {
							$CNSforBreakpoint->setSeq( $conservatoryTree->getReconstructedSequence($CNSforBreakpoint, \%speciesSeq, $minSequenceContentToConsiderReconstruction*100));
							my @CNSSpecies = sort map { $_->getSpecies() } @{ $mappingDB->getMappingsForCNS($CNSforBreakpoint) };
							$CNSforBreakpoint->setConservationLevel( $conservatoryTree->findDeepestCommonNode( \@CNSSpecies) ); 
						}
					} else {
						$CNSDB->deleteCNS($CNSforBreakpoint);
						$mappingDB->deleteCNS($CNSforBreakpoint);
					}
				}
				### Finally, delete the original CNS
				$CNSDB->deleteCNS($curCNS);
				$mappingDB->deleteCNS($curCNS);
			} else {
				### reconstruct ancesteral
				if($reconstruct) {
					my %speciesSeq = getSpeciesSequencesForCNS($curCNS, $mappingDB, $conservatoryTree);
					$curCNS->setSeq( $conservatoryTree->getReconstructedSequence($curCNS, \%speciesSeq,$minSequenceContentToConsiderReconstruction*100));
				}
			}
		} elsif($reconstruct) {
			my %speciesSeq = getSpeciesSequencesForCNS($curCNS, $mappingDB, $conservatoryTree);
			$curCNS->setSeq( $conservatoryTree->getReconstructedSequence($curCNS, \%speciesSeq, $minSequenceContentToConsiderReconstruction*100));
			my @CNSSpecies = sort map { $_->getSpecies() } @{ $mappingDB->getMappingsForCNS($curCNS) };
			$curCNS->setConservationLevel( $conservatoryTree->findDeepestCommonNode( \@CNSSpecies) ); 			
		}
	}

	if($curCNS->isAlive()) {
		if($reconstruct) {
			my @CNSSpecies = sort map { $_->getSpecies() } @{ $mappingDB->getMappingsForCNS($curCNS) };
			$curCNS->setConservationLevel( $conservatoryTree->findDeepestCommonNode( \@CNSSpecies) ); 
		}
	}

	if($verbose) { print "PROGRESS: Reconstructed $curPos/$totalToScreen...\r"; }			
}

### Filter low support
$CNSDB->updateNumberOfSupportingSpecies($mappingDB);

foreach my $curCNS (@{ $CNSDB->getCNSByOrder() }) {
	if($curCNS->getSupportingSpeciesNumber() < $minSpeciesForCNS ) { 
		if($verbose) { print "PROGRESS: Dropping CNS " . $curCNS->getID() . " for low number of species (" . $curCNS->getSupportingSpeciesNumber() . ").\n"; }

		$CNSDB->deleteCNS($curCNS);
		$mappingDB->deleteCNS($curCNS);
	}
}

if($verbose) { print "\nPROGRESS: Done. We now have " . $CNSDB->getNumberOfCNSs() . " CNSs...\n"; }

if($verbose) { print "\nPROGRESS: Done. Output files.\n"; }

########### Output files

## Go back to our working directory

chdir $curDir;
rmdir $tmpDir;
### New CNS files

$CNSDB->writeDatabase($outputCNSFileName);
$mappingDB->writeDatabase($outputMapFileName);


###########################################################################################################
###########
###########

sub getSpeciesSequencesForCNS {
	my ($CNS, $mappingDB, $conservatoryTree) = @_;
	my %speciesSequencesForCNS;

	foreach my $curMapping (@{ $mappingDB->getMappingsForCNS ($CNS) }) {
		my $speciesName = $curMapping->getSpecies();
		## replace the dot in species name which makes tree life difficult
		$speciesName =~ s/\./_/g;
		if($conservatoryTree->exists($speciesName)) {
			if($curMapping->getAbsLen() > length($speciesSequencesForCNS{ $speciesName }) ) {
				$speciesSequencesForCNS{ $speciesName } = $curMapping->getSeq();
			}
		}
	}
	return %speciesSequencesForCNS;
}

